import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
# Step 1: Load and Prepare Data
# Vehicle sales by category with one count column per year ('2022', '2023');
# see the table printed below for the expected schema.
df1 = pd.read_csv("RS_Session_263_AU_102_A.csv")
df1
| Category | 2022 | 2023 | |
|---|---|---|---|
| 0 | 2 Wheelers | 631464 | 859376 |
| 1 | 3 Wheelers | 352710 | 582793 |
| 2 | Commercial Vehicles | 2649 | 5673 |
| 3 | Passenger Vehicles | 38240 | 82105 |
# Step 3: Visualization
# Side-by-side bar charts of sales per category, one panel per year.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
for ax, year in zip(axes, ('2022', '2023')):
    sns.barplot(x=year, y='Category', data=df1, ax=ax)
    ax.set_title(f'Vehicle Sales in {year}')
    ax.set_xlabel('Number of Vehicles Sold')
axes[0].set_ylabel('Category')
axes[1].set_ylabel('')
plt.tight_layout()
plt.show()
# Raw sales figures (same numbers as df1, spelled out for the pie charts).
categories = ['2 Wheelers', '3 Wheelers', 'Commercial Vehicles', 'Passenger Vehicles']
sales_2022 = [631464, 352710, 2649, 38240]
sales_2023 = [859376, 582793, 5673, 82105]
# One pie chart per year showing each category's share of total sales.
for year, sales in (('2022', sales_2022), ('2023', sales_2023)):
    plt.figure(figsize=(10, 6))
    plt.pie(sales, labels=categories, autopct='%1.1f%%', startangle=140)
    plt.title(f'Vehicle Sales Distribution in {year}')
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.show()
# Segment the categories with K-means on the two yearly sales columns.
numeric_features = df1.drop('Category', axis=1)  # keep only the year columns
# Standardise to zero mean / unit variance so both years weigh equally.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)
# Three clusters, fixed seed for reproducible assignments.
kmeans = KMeans(n_clusters=3, random_state=42)
df1['Cluster'] = kmeans.fit_predict(scaled_features)
# Visualise the clusters: one marker shape per year, colour = cluster id.
plt.figure(figsize=(10, 6))
for year, marker in (('2022', 'o'), ('2023', 's')):
    plt.scatter(df1[year], df1['Category'], c=df1['Cluster'],
                cmap='viridis', s=100, marker=marker, alpha=0.8, label=year)
plt.xlabel('Number of Vehicles Sold')
plt.ylabel('Vehicle Category')
plt.title('Market Segmentation based on Vehicle Sales')
plt.legend()
plt.show()
C:\Users\user\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1426: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
# Monthly EV registrations: columns are type, financial_year, month,
# num_vehicles (see the head() table below).
df2 = pd.read_csv("Ev_data1.csv")
df2.head()
| type | financial_year | month | num_vehicles | |
|---|---|---|---|---|
| 0 | E-2 Wheelers | 2017-18 | Apr | 96 |
| 1 | E-2 Wheelers | 2017-18 | May | 91 |
| 2 | E-2 Wheelers | 2017-18 | Jun | 137 |
| 3 | E-2 Wheelers | 2017-18 | Jul | 116 |
| 4 | E-2 Wheelers | 2017-18 | Aug | 99 |
# Standardize the 'num_vehicles' column using StandardScaler
scaler = StandardScaler()
df2_scaled = scaler.fit_transform(df2[['num_vehicles']])
# Step 4: Apply K-means Clustering
kmeans = KMeans(n_clusters=4, random_state=42)
# FIX: the original assigned to `df['Cluster']` from `df_scaled`, but neither
# `df` nor `df_scaled` is defined anywhere in this notebook -- the frame in
# this section is `df2` and the scaled array is `df2_scaled`.
df2['Cluster'] = kmeans.fit_predict(df2_scaled)
C:\Users\user\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1426: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
# Step 5: Visualization of Clusters
# Open a fresh canvas for the scatter plots that follow.
plt.figure(figsize=(12, 8))
<Figure size 1200x800 with 0 Axes>
<Figure size 1200x800 with 0 Axes>
# Scatter plot of monthly sales per financial year.
sns.scatterplot(data=df2, x='financial_year', y='num_vehicles')
plt.title('Market Segmentation of E-2 Wheelers based on Monthly Sales')
plt.xlabel('Financial Year')
plt.ylabel('Number of Vehicles Sold')
# FIX: dropped plt.legend(title='Cluster') -- no plotted artist carries a
# label here, so matplotlib only emitted "No artists with labels found".
# Pass hue='Cluster' to the scatterplot if a cluster legend is wanted.
plt.grid(True)
plt.xticks(rotation=45)
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
# Scatter plot of monthly sales per calendar month.
sns.scatterplot(data=df2, x='month', y='num_vehicles')
plt.title('Market Segmentation of E-2 Wheelers based on Monthly Sales')
# FIX: the x-axis is the month, not the financial year (label was copy-pasted
# from the previous plot).
plt.xlabel('Month')
plt.ylabel('Number of Vehicles Sold')
# FIX: dropped plt.legend(title='Cluster') -- no labeled artists, the call
# only produced a "No artists with labels found" warning.
plt.grid(True)
plt.xticks(rotation=45)
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
# Fit a cubic polynomial trend to the monthly sales series.
degree = 3
x = np.arange(len(df2))            # time axis: row index = month number
y = df2['num_vehicles'].values
trend = np.poly1d(np.polyfit(x, y, degree))
# Actual observations vs. the fitted trend curve.
plt.figure(figsize=(12, 8))
plt.scatter(x, y, color='blue', label='Actual Data')
plt.plot(x, trend(x), color='red', label='Polynomial Regression (Degree 3)')
plt.title('Polynomial Regression Trend of E-2 Wheelers Sales')
plt.xlabel('Time (Months)')
plt.ylabel('Number of Vehicles Sold')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Cluster the months into 4 groups by raw sales volume (single feature).
X = df2[['num_vehicles']]
kmeans = KMeans(n_clusters=4, random_state=42)
df2['cluster'] = kmeans.fit_predict(X)
# Colour every observation by its cluster id.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df2, x=df2.index, y='num_vehicles', hue='cluster',
                palette='viridis', legend='full')
plt.title('K-means Clustering of E-2 Wheelers Sales')
plt.xlabel('Time (Months)')
plt.ylabel('Number of Vehicles Sold')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
C:\Users\user\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1426: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
# State-wise vehicle counts by class (Two/Three/Four Wheeler, ...); several
# columns contain NaN -- see the head() table below for the schema.
df3 = pd.read_csv("RS_Session_256_AU_95_C.csv")
df3.head()
| State Name | Two Wheeler | Three Wheeler | Four Wheeler | Goods Vehicles | Public Service Vehicle | Special Category Vehicles | Ambulance/Hearses | Construction Equipment Vehicle | Other | Grand Total | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Andaman and Nicobar Island | 1 | 30.0 | 81 | NaN | 40.0 | NaN | NaN | NaN | 7.0 | 159 |
| 1 | Arunachal Pradesh | 14 | NaN | 5 | NaN | NaN | NaN | NaN | NaN | 1.0 | 20 |
| 2 | Assam | 721 | 47041.0 | 161 | 7.0 | 15.0 | NaN | NaN | NaN | 2.0 | 47947 |
| 3 | Bihar | 5003 | 59079.0 | 114 | 11.0 | 26.0 | NaN | NaN | NaN | 8.0 | 64241 |
| 4 | Chandigarh | 298 | 1410.0 | 182 | NaN | 40.0 | NaN | NaN | NaN | 1.0 | 1931 |
# For each vehicle class: sort states by demand, then draw a horizontal bar
# chart with the highest-demand state on top.
for column in ('Two Wheeler', 'Three Wheeler', 'Four Wheeler'):
    df3.sort_values(by=column, ascending=False, inplace=True)
    plt.figure(figsize=(12, 8))
    plt.barh(df3['State Name'], df3[column], color='skyblue')
    plt.xlabel(f'Number of {column}s')
    plt.title(f'Demand for {column}s Across States')
    plt.gca().invert_yaxis()  # barh draws bottom-up; flip so the largest is on top
    plt.tight_layout()
    plt.show()
# Yearly EV registrations per category: columns are category,
# financial_year ('YYYY-YY' strings), num_vehicles.
df4 = pd.read_csv("EV_industry.csv")
df4.head()
| category | financial_year | num_vehicles | |
|---|---|---|---|
| 0 | E-2 Wheelers | 2017-18 | 1981 |
| 1 | E-2 Wheelers | 2018-19 | 27478 |
| 2 | E-2 Wheelers | 2019-20 | 26512 |
| 3 | E-2 Wheelers | 2020-21 | 44294 |
| 4 | E-2 Wheelers | 2021-22 | 249615 |
# Convert 'financial_year' to numeric values for easier manipulation:
# 'YYYY-YY' -> integer starting year, so regression gets a numeric axis.
df4['year'] = pd.to_numeric(df4['financial_year'].str.split('-').str[0])
# One sub-frame per vehicle category for per-category fitting/plotting.
e2_data = df4[df4['category'] == 'E-2 Wheelers']
e3_data = df4[df4['category'] == 'E-3 Wheelers']
e4_data = df4[df4['category'] == 'E-4 Wheelers']
ebus_data = df4[df4['category'] == 'E-Buses']
def polynomial_regression(x, y, degree):
    """Least-squares polynomial fit of y against x.

    x is a pandas Series (its .values are reshaped into a column vector);
    returns the LinearRegression coefficient array: index 0 is the
    always-zero bias column added by PolynomialFeatures, index ``degree``
    the leading coefficient.
    """
    design_matrix = PolynomialFeatures(degree=degree).fit_transform(
        x.values.reshape(-1, 1))
    fitted = LinearRegression().fit(design_matrix, y)
    return fitted.coef_
# Fit a quadratic per category; the x^2 coefficient (index 2 of the
# coefficient array) measures how sharply the trend curves upward.
degree = 2
frames = [
    (e2_data, 'E-2 Wheelers'),
    (e3_data, 'E-3 Wheelers'),
    (e4_data, 'E-4 Wheelers'),
    (ebus_data, 'E-Buses'),
]
slopes = [
    (label, polynomial_regression(data['year'], data['num_vehicles'], degree)[2])
    for data, label in frames
]
# Raw yearly trend line for every category on one axis.
plt.figure(figsize=(10, 6))
for data, label in frames:
    plt.plot(data['year'], data['num_vehicles'], marker='o', label=label)
plt.title('Trends in Electric Vehicle Categories')
plt.xlabel('Year')
plt.ylabel('Number of Vehicles')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Report the curvature coefficients.
print(f"Slopes (Coefficient of x^2) for each category:")
for label, slope in slopes:
    print(f"{label}: {slope:.2f}")
Slopes (Coefficient of x^2) for each category: E-2 Wheelers: 55179.25 E-3 Wheelers: 22230.18 E-4 Wheelers: 3555.82 E-Buses: 98.55
# Bike spec sheet: model_name, price, mileage, type_of_bike, weight_in_kg,
# acceleration_speed (has NaN, see row 2 below), top_speed.
df5 = pd.read_csv("bike_dataset.csv")
df5.head()
| model_name | price | mileage | type_of_bike | weight_in_kg | acceleration_speed | top_speed | |
|---|---|---|---|---|---|---|---|
| 0 | Gravton Motors Quanta | 99000 | 320.0 | Electric Bike | 100 | 4.2 | 70.0 |
| 1 | Simple Energy One | 109999 | 236.0 | Electric Bike | 110 | 3.6 | 100.0 |
| 2 | Okaya Classiq | 69900 | 200.0 | Electric Bike | 95 | NaN | 25.0 |
| 3 | Oben Electric Rorr | 102999 | 200.0 | Electric Bike | 120 | 3.0 | 100.0 |
| 4 | Ola Electric S1 | 85099 | 181.0 | Electric Bike | 121 | 2.9 | 116.0 |
# Price vs. mileage: colour encodes bike type, marker shape encodes the model.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df5, x='mileage', y='price',
                     hue='type_of_bike', style='model_name', s=100)
ax.set_title('Price vs Mileage for Different Bike Types')
ax.set_xlabel('Mileage (km)')
ax.set_ylabel('Price (INR)')
# Park the (large) legend outside the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
C:\Users\user\AppData\Local\Temp\ipykernel_16828\4177900366.py:12: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. plt.tight_layout()
# Price vs. kerb weight: colour encodes bike type, marker shape the model.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df5, x='weight_in_kg', y='price',
                     hue='type_of_bike', style='model_name', s=100)
ax.set_title('Price vs Weight for Different Bike Types')
ax.set_xlabel('Weight (kg)')
ax.set_ylabel('Price (INR)')
# Park the (large) legend outside the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
C:\Users\user\AppData\Local\Temp\ipykernel_16828\3190787227.py:12: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. plt.tight_layout()
# FIX: the original filtered rows where acceleration_speed == 'N/A', but
# pandas already parsed missing entries in this column as NaN (float dtype --
# see the NaN in the head() table above), so the string comparison never
# matched and the "filter" was a no-op. Coerce defensively instead: any
# unparseable value becomes NaN, which seaborn simply skips when plotting.
df5['acceleration_speed'] = pd.to_numeric(df5['acceleration_speed'], errors='coerce')
# Plotting
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df5, x='acceleration_speed', y='price', hue='type_of_bike', style='model_name', s=100)
# Labels and title
plt.title('Price vs Acceleration Speed for Different Bike Types')
plt.xlabel('Acceleration Speed (seconds to 60 km/h)')
plt.ylabel('Price (INR)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Show plot
plt.tight_layout()
plt.show()
C:\Users\user\AppData\Local\Temp\ipykernel_16828\1924144846.py:12: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. plt.tight_layout()
# FIX: as with acceleration_speed, comparing a float column against the
# string 'N/A' never matches (missing values are NaN), so the original
# filter was a no-op. Coerce instead; NaN rows are skipped by seaborn.
df5['top_speed'] = pd.to_numeric(df5['top_speed'], errors='coerce')
# Plotting
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df5, x='top_speed', y='price', hue='type_of_bike', style='model_name', s=100)
# Labels and title
plt.title('Price vs Top Speed for Different Bike Types')
plt.xlabel('Top Speed (km/h)')
plt.ylabel('Price (INR)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# Show plot
plt.tight_layout()
plt.show()
C:\Users\user\AppData\Local\Temp\ipykernel_16828\1183808907.py:12: UserWarning: Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. plt.tight_layout()
# Handle missing values: impute missing acceleration times with the column mean.
df5['acceleration_speed'] = df5['acceleration_speed'].replace('N/A', np.nan).astype(float)
# FIX: `df5['acceleration_speed'].fillna(..., inplace=True)` mutates a
# temporary object returned by the column selection -- pandas warns about
# (and in newer versions ignores) chained inplace calls. Assign back instead.
df5['acceleration_speed'] = df5['acceleration_speed'].fillna(df5['acceleration_speed'].mean())
# Remove the 'model_name' column for analysis
df5_no_model = df5.drop(columns=['model_name'])
# One violin plot per numeric feature, grouped by bike type.
plt.figure(1, figsize=(15, 7))
n = 0
for cols in ['price', 'mileage', 'weight_in_kg', 'acceleration_speed', 'top_speed']:
    n += 1
    plt.subplot(1, 5, n)
    sns.set(style='whitegrid')
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.violinplot(x=cols, y='type_of_bike', data=df5_no_model)
    plt.ylabel("Type of Bike" if n == 1 else '')  # label only the first panel
    plt.title(f'Violin Plot of {cols.capitalize()}')
plt.show()
# Bike-buyer survey: demographics plus the 'Purchased Bike' target column
# ('Yes'/'No'); some rows contain NaN (see row 3 below).
df6 = pd.read_csv("bike_buyers.csv")
df6.head()
| ID | Marital Status | Gender | Income | Children | Education | Occupation | Home Owner | Cars | Commute Distance | Region | Age | Purchased Bike | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12496 | Married | Female | 40000.0 | 1.0 | Bachelors | Skilled Manual | Yes | 0.0 | 0-1 Miles | Europe | 42.0 | No |
| 1 | 24107 | Married | Male | 30000.0 | 3.0 | Partial College | Clerical | Yes | 1.0 | 0-1 Miles | Europe | 43.0 | No |
| 2 | 14177 | Married | Male | 80000.0 | 5.0 | Partial College | Professional | No | 2.0 | 2-5 Miles | Europe | 60.0 | No |
| 3 | 24381 | Single | NaN | 70000.0 | 0.0 | Bachelors | Professional | Yes | 1.0 | 5-10 Miles | Pacific | 41.0 | Yes |
| 4 | 25597 | Single | Male | 30000.0 | 0.0 | Bachelors | Clerical | No | 0.0 | 0-1 Miles | Europe | 36.0 | Yes |
# Drop incomplete rows, one-hot encode the categoricals, and inspect the
# pairwise correlations as an annotated heatmap.
df6 = df6.dropna()
df6_encoded = pd.get_dummies(df6, drop_first=True)
correlation_matrix = df6_encoded.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()
# Ensure the count-like columns carry numeric dtypes.
for numeric_col in ('Income', 'Cars', 'Age'):
    df6[numeric_col] = pd.to_numeric(df6[numeric_col])
# Income distribution by marital status; each violin split by purchase outcome.
plt.figure(figsize=(12, 8))
sns.violinplot(x='Marital Status', y='Income', data=df6, hue='Purchased Bike',
               split=True, inner='quart', linewidth=1.5)
plt.title('Income Distribution by Marital Status and Bike Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Income')
plt.legend(title='Purchased Bike', loc='best')
plt.show()
# Plot Violin Plot
plt.figure(figsize=(12, 8))
sns.violinplot(x='Gender', y='Income', data=df6, hue='Purchased Bike', split=True, inner='quart', linewidth=1.5)
# FIX: the title said 'Marital Status' (copy-pasted from the previous plot)
# while this plot's x-axis is Gender.
plt.title('Income Distribution by Gender and Bike Purchase')
plt.xlabel('Gender')
plt.ylabel('Income')
plt.legend(title='Purchased Bike', loc='best')
plt.show()
# Pair Plot
# Pairwise scatter matrix over df6's numeric columns; diagonal shows each
# column's distribution.
sns.pairplot(df6)
plt.suptitle('Pair Plot of Numerical Variables', y=1.02)
plt.show()
# Filter for married men who bought a bike or not
married_men = df6[(df6['Marital Status'] == 'Married') & (df6['Gender'] == 'Male')]
# Scatter plot
plt.figure(figsize=(10, 6))
# FIX: the original computed `married_men` and then plotted the full `df6`,
# so the filter had no effect. Plot the filtered subset, as the comment above
# announces.
sns.scatterplot(x='Income', y='Age', hue='Purchased Bike', data=married_men, palette='Set1', s=100)
plt.title('Income vs Age: Bike Purchase (Married Men)')
plt.xlabel('Income')
plt.ylabel('Age')
plt.legend(title='Purchased Bike', loc='upper right')
plt.show()
# Income vs. gender (categorical y), coloured by purchase outcome.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x='Income', y='Gender', hue='Purchased Bike',
                     data=df6, palette='Set1', s=100)
ax.set_title('Scatter Plot: Income vs Gender')
ax.set_xlabel('Income')
ax.set_ylabel('Gender')
ax.legend(title='Purchased Bike', loc='upper right')
plt.show()
# Calculate correlations
# FIX: df6 still contains string columns ('Marital Status', 'Gender', ...),
# and plain df6.corr() raised "ValueError: could not convert string to float"
# (modern pandas no longer silently drops non-numeric columns -- see the
# traceback below). Restrict the correlation to numeric columns.
corr_matrix = df6.corr(numeric_only=True)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[412], line 2 1 # Calculate correlations ----> 2 corr_matrix = df6.corr() File ~\anaconda3\Lib\site-packages\pandas\core\frame.py:10054, in DataFrame.corr(self, method, min_periods, numeric_only) 10052 cols = data.columns 10053 idx = cols.copy() > 10054 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 10056 if method == "pearson": 10057 correl = libalgos.nancorr(mat, minp=min_periods) File ~\anaconda3\Lib\site-packages\pandas\core\frame.py:1838, in DataFrame.to_numpy(self, dtype, copy, na_value) 1836 if dtype is not None: 1837 dtype = np.dtype(dtype) -> 1838 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) 1839 if result.dtype is not dtype: 1840 result = np.array(result, dtype=dtype, copy=False) File ~\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1732, in BlockManager.as_array(self, dtype, copy, na_value) 1730 arr.flags.writeable = False 1731 else: -> 1732 arr = self._interleave(dtype=dtype, na_value=na_value) 1733 # The underlying data was copied within _interleave, so no need 1734 # to further copy if copy=True or setting na_value 1736 if na_value is not lib.no_default: File ~\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1794, in BlockManager._interleave(self, dtype, na_value) 1792 else: 1793 arr = blk.get_values(dtype) -> 1794 result[rl.indexer] = arr 1795 itemmask[rl.indexer] = 1 1797 if not itemmask.all(): ValueError: could not convert string to float: 'Married'
# Annotated heatmap of the correlation matrix computed above, pinned to the
# full [-1, 1] colour range.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Variables')
plt.show()
# Age vs. income for all respondents, coloured by purchase outcome.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x='Age', y='Income', hue='Purchased Bike',
                     data=df6, palette='Set1', s=100)
ax.set_title('Scatter Plot: Age vs Income (Purchased Bike)')
ax.set_xlabel('Age')
ax.set_ylabel('Income')
ax.legend(title='Purchased Bike', loc='upper right')
plt.show()
def _plot_purchase_counts(frame, column, title, xlabel, rotation=0, tight=False):
    """Stacked bar chart of 'Purchased Bike' counts grouped by *column*."""
    counts = frame.groupby([column, 'Purchased Bike']).size().unstack(fill_value=0)
    counts.plot(kind='bar', stacked=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    # unstack() orders columns alphabetically, so 'No' precedes 'Yes'.
    plt.legend(title='Purchased Bike', loc='upper right', labels=['No', 'Yes'])
    plt.xticks(rotation=rotation)
    if tight:
        plt.tight_layout()
    plt.show()

# Drop rows with missing values before counting.
df6 = df6.dropna()
_plot_purchase_counts(df6, 'Cars',
                      'Count of Purchased Bike by Number of Cars', 'Number of Cars')
# Remove rows where Gender is missing, then break counts down per attribute.
df6 = df6.dropna(subset=['Gender'])
_plot_purchase_counts(df6, 'Gender',
                      'Count of Purchased Bike by Gender', 'Gender')
_plot_purchase_counts(df6, 'Marital Status',
                      'Count of Purchased Bike by Marital Status', 'Marital Status')
_plot_purchase_counts(df6, 'Children',
                      'Count of Purchased Bike by Number of Children', 'Number of Children')
_plot_purchase_counts(df6, 'Occupation',
                      'Count of Purchased Bike by Occupation', 'Occupation',
                      rotation=45, tight=True)
_plot_purchase_counts(df6, 'Home Owner',
                      'Count of Purchased Bike by Home Owner Status', 'Home Owner')